*** How worried should we be? The implications of fabricated survey data for political science
*** Figure A6. Comparing Modal Interviewer Response to Fraudulent and Clean Responses

* Session setup: disable output paging so the script runs without pausing.
set more off

* Point Stata at the folder that holds the dataset (edit the path below for your machine).
cd "C:\~\Downloads\"

use "VEN_fraud_data.dta", clear

* env2b and drk1 are each stored in two source variables (suffix 1 and suffix 2).
* Combine them: use the second version where the first is coded 999999 (this also
* covers the case where both are 999999), otherwise use the first version where
* the second is coded 999999. Rows where neither version is 999999 stay missing.
generate env2b = cond(env2b1 == 999999, env2b2, cond(env2b2 == 999999, env2b1, .))
generate drk1 = cond(drk11 == 999999, drk12, cond(drk12 == 999999, drk11, .))

* Difference of means, one output row per survey question.
* For every question the loop:
*   1. recodes the listed special codes to missing,
*   2. rescales the item to 0-100 using its observed minimum and maximum,
*   3. runs an unequal-variance t-test of the rescaled item by likelyfraud,
*      restricted to observations with cem_matched == 1,
*   4. posts the t-test results to questions.dta.
* Group 1 of the t-test is stored in the clean_* columns and group 2 in the fraud_* columns.

* Close any handle left open by an earlier run that stopped inside the loop;
* without this, postfile aborts because the name qdata is still in use.
capture postclose qdata
postfile qdata str32 question clean_mu clean_sd fraud_mu fraud_sd diff diff_se n1 n2 dof pval using "questions.dta", replace
foreach x of varlist l1 venl2 venl3 b1 b2 b3 b4 b6 b43 b11 b12 b13 b18 b21 b21a b32 b37 b47a venb11 venb10 venb51 venvb18 venvb19 venvb20 pr3dn pr3en polz1 ///
ros1 ros4 ing4 eff1 eff2 vengrp15 vengrp16 aoj22new media3 media4 media4b media1 media2 media2b dst1b env1c e5 e15 e3 e16 ///
d1 d2 d3 d4 d5 d6 vengrp5 vengrp6 vengrp7 vengrp8 vengrp9 vengrp10 vengrp11 vengrp12 vengrp13 vengrp14 venps1 venps2 venps3 venct1 venct2 venct3 ///
ls3 soct2 idio2 cp6 cp7 cp8 cp13 cp20 it1 aoj11 aoj12 polz1a m1 m2 drk1 env2b pn4 pn5 lib1 lib2b lib2c ///
lib4 fear11 pol1 vengrp1 vengrp2 vengrp3 vengrp4 mil10a1 mil10e1 mil10un1 mil10un2 mil10a2 mil10e2 venesc2b venesc3 ///
sd2new2 sd3new2 sd6new2 venprot10 venprot12 venprot11 mil10oas1 mil10oas2 {
	di "`x'"
	* Special codes are set to missing before any statistics are computed.
	recode `x' (888888 988888 999999 99 = .)
	* Min-max rescaling to 0-100; the range comes from all non-missing observations,
	* not only the cem_matched subsample used in the t-test below.
	* (A 0-10 variant, multiplying by 10 instead of 100, was tried earlier and is not used.)
	summ `x'
	gen `x'_rs = (`x' - r(min))/(r(max) - r(min)) * 100
	ttest `x'_rs if cem_matched == 1, by(likelyfraud) unequal
	post qdata ("`x'") (r(mu_1)) (r(sd_1)) (r(mu_2)) (r(sd_2)) (r(mu_1)-r(mu_2)) (r(se)) (r(N_1)) (r(N_2)) (r(df_t)) (r(p))
}
postclose qdata
* Estimates of the average interviewer's response, one output row per question.
* For every question the loop regresses the rescaled item (the _rs variable) on
* mujer, edad, ed_level, skintone and large_city among observations with
* clean_data == 1, then evaluates the prediction at the fixed profile given in
* the margins at() option. The prediction is written to questions_int.dta and
* also stored in the dataset as a constant variable with suffix _interviewer.
* Requires the _rs variables to exist already.

* Close any handle left open by an earlier run that stopped inside the loop;
* without this, postfile aborts because the name qdata is still in use.
capture postclose qdata
postfile qdata str32 question interviewer_mean using "questions_int.dta", replace
foreach x of varlist l1 venl2 venl3 b1 b2 b3 b4 b6 b43 b11 b12 b13 b18 b21 b21a b32 b37 b47a venb11 venb10 venb51 venvb18 venvb19 venvb20 pr3dn pr3en polz1 ///
ros1 ros4 ing4 eff1 eff2 vengrp15 vengrp16 aoj22new media3 media4 media4b media1 media2 media2b dst1b env1c e5 e15 e3 e16 ///
d1 d2 d3 d4 d5 d6 vengrp5 vengrp6 vengrp7 vengrp8 vengrp9 vengrp10 vengrp11 vengrp12 vengrp13 vengrp14 venps1 venps2 venps3 venct1 venct2 venct3 ///
ls3 soct2 idio2 cp6 cp7 cp8 cp13 cp20 it1 aoj11 aoj12 polz1a m1 m2 drk1 env2b pn4 pn5 lib1 lib2b lib2c ///
lib4 fear11 pol1 vengrp1 vengrp2 vengrp3 vengrp4 mil10a1 mil10e1 mil10un1 mil10un2 mil10a2 mil10e2 venesc2b venesc3 ///
sd2new2 sd3new2 sd6new2 venprot10 venprot12 venprot11 mil10oas1 mil10oas2 {
	di "`x'"
	reg `x'_rs mujer i.edad i.ed_level skintone large_city if clean_data == 1, robust
	* The post option replaces the regression results with the margins results,
	* so _b[_cons] below is the predicted value at the at() profile rather than
	* the regression intercept.
	margins, at(mujer=1 edad=1 ed_level=3 skintone=4 large_city=1) post
	post qdata ("`x'") (_b[_cons])
	gen `x'_interviewer = _b[_cons]
}
postclose qdata
* Differences from the interviewer response, one output row per question.
* For every question the loop subtracts the interviewer prediction, rounded to
* the nearest integer, from the rescaled item, then runs an unequal-variance
* t-test of that difference by likelyfraud among observations with
* cem_matched == 1. Results go to questions_intervdiffs.dta; group 1 fills the
* clean_*_i columns and group 2 the fraud_*_i columns.
* Requires the _rs and _interviewer variables to exist already.

* Close any handle left open by an earlier run that stopped inside the loop;
* without this, postfile aborts because the name qdata is still in use.
capture postclose qdata
postfile qdata str32 question clean_mu_i clean_sd_i fraud_mu_i fraud_sd_i diff_i diff_se_i n1_i n2_i dof_i pval_i using "questions_intervdiffs.dta", replace
foreach x of varlist l1 venl2 venl3 b1 b2 b3 b4 b6 b43 b11 b12 b13 b18 b21 b21a b32 b37 b47a venb11 venb10 venb51 venvb18 venvb19 venvb20 pr3dn pr3en polz1 ///
ros1 ros4 ing4 eff1 eff2 vengrp15 vengrp16 aoj22new media3 media4 media4b media1 media2 media2b dst1b env1c e5 e15 e3 e16 ///
d1 d2 d3 d4 d5 d6 vengrp5 vengrp6 vengrp7 vengrp8 vengrp9 vengrp10 vengrp11 vengrp12 vengrp13 vengrp14 venps1 venps2 venps3 venct1 venct2 venct3 ///
ls3 soct2 idio2 cp6 cp7 cp8 cp13 cp20 it1 aoj11 aoj12 polz1a m1 m2 drk1 env2b pn4 pn5 lib1 lib2b lib2c ///
lib4 fear11 pol1 vengrp1 vengrp2 vengrp3 vengrp4 mil10a1 mil10e1 mil10un1 mil10un2 mil10a2 mil10e2 venesc2b venesc3 ///
sd2new2 sd3new2 sd6new2 venprot10 venprot12 venprot11 mil10oas1 mil10oas2 {
	di "`x'"
	* round() with one argument rounds to the nearest integer on the 0-100 scale.
	gen `x'_diff_interview = `x'_rs - round(`x'_interviewer)
	ttest `x'_diff_interview if cem_matched == 1, by(likelyfraud) unequal
	post qdata ("`x'") (r(mu_1)) (r(sd_1)) (r(mu_2)) (r(sd_2)) (r(mu_1)-r(mu_2)) (r(se)) (r(N_1)) (r(N_2)) (r(df_t)) (r(p))
}
postclose qdata

* Assemble the question-level results: start from the diff-of-means file and
* attach the interviewer estimates and the interviewer-difference tests by question.
use "questions.dta", clear
merge 1:1 question using "questions_int.dta", nogenerate
merge 1:1 question using "questions_intervdiffs.dta", nogenerate

* Absolute distance of the fraud-group mean from the clean-group mean and from
* the interviewer estimate, respectively.
generate abs_diffw_genuine = abs(fraud_mu - clean_mu)
generate abs_diffw_interviewer = abs(fraud_mu - interviewer_mean)

* Figure A6. Comparing Modal Interviewer Response to Fraudulent and Clean Responses
* Scatter of the two distances with a 45-degree reference line drawn over the
* range of the x variable.
* NOTE(review): plotplain is not a built-in scheme; presumably it comes from the
* user-written blindschemes package - confirm it is installed before running.
twoway ///
	(scatter abs_diffw_genuine abs_diffw_interviewer) ///
	(function y = x, range(abs_diffw_interviewer)), ///
	ytitle("Abs(fraud mean - clean mean)") ///
	xtitle("Abs(fraud mean - modal interviewer response)") ///
	legend(off) scheme(plotplain)


